Note: bin metabat.75_sub.contigs got caught in the data and needs ot be removed. Looks contaminated in GC coverage plot
set up working space
rm(list=ls())
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df.all<-read.csv('Data/02_Metagenomics/mag_visualization/August2019_annotated_covstats_all_bins.csv')
head(df.all)
tail(df.all)
length(unique(df.all$name))
## [1] 126
df.all<-df.all[!df.all$name=='metabat.75_sub.contigs',]
#all bins
bins.all<-read.table('Data/02_Metagenomics/mag_visualization/all_bins.tsv')
head(bins.all)
colnames(bins.all)<-c('Name','Contig')
bin_Freq.all<-data.frame(table(bins.all$Name))
length(unique(bins.all$Name))
## [1] 126
bins.all<-bins.all[!bins.all$Name=='metabat.75_sub.contigs',]
#sanity check
length(unique(bins.all$Contig))
## [1] 84223
dim(bins.all)
## [1] 84223 2
#Normalize Avg_fold values to num contigs per bin
df.all$Total<-bin_Freq.all[match(df.all$name, bin_Freq.all$Var1),'Freq']
df.all$Norm_Avg_Fold<-df.all$Avg_fold/df.all$Total
#Add in habitat
lookup<-data.frame(libs=c('3847_A', '3847_B', '3847_C', '3847_D', '3847_E', '3847_F', '3847_G','3847_H', '3847_I'), habitat=rep(c('Out','Edge','In'),c(3,3,3)))
lookup
df.all$habitat<-lookup[match(df.all$library, lookup$libs),'habitat']
df.all$habitat<-factor(x = df.all$habitat, ordered = T, levels=c('In','Edge','Out'))
head(df.all)
summary(df.all$Length) #gives summary about contig lengths
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1001 2955 4364 6740 7534 369755
Calculate average coverage for each bin across all contigs per habitat and per sample
# reduce dataset to only look at contigs with 5000 bp
df.reduced.all<-df.all[df.all$Length > 5000,]
s.libs.all<-df.reduced.all %>% group_by(library,name) %>% summarise(mean=mean(Avg_fold),se=sd(Avg_fold)/sqrt(length(Avg_fold)), mean_norm_cov=mean(Norm_Avg_Fold))
head(s.libs.all)
Rename libraries
lib_lookup<-data.frame(library=unique(df.reduced.all$library), letter=c('A','B','C','D','E','F','G','H','I'))
df.reduced.all$lib2<-lib_lookup[match(df.reduced.all$library, lib_lookup$library),'letter']
df.reduced.all$lib2<-factor(df.reduced.all$lib2, ordered=T, levels=rev(c('A','B','C','D','E','F','G','H','I')))
s.libs.all$lib2<-lib_lookup[match(s.libs.all$library, lib_lookup$library),'letter']
in_edge_out_colors<-c('forestgreen','goldenrod1','lightblue2')
theme_new<- theme_bw() + theme(panel.grid.major = element_line(color='gray90'), panel.grid.minor = element_line(color='gray90'))
split into 25 plot slices
bins<-sort(unique(df.reduced.all$name))
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[1:25],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat)) +
geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[1:25],], aes(x=lib2, y=log(mean+1))) +
facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())
ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice1.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[26:50],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat)) +
geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[26:50],], aes(x=lib2, y=log(mean+1))) +
facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())
ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice2.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[51:75],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat)) +
geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[51:75],], aes(x=lib2, y=log(mean+1))) +
facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())
ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice3.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[76:100],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat)) +
geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[76:100],], aes(x=lib2, y=log(mean+1))) +
facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())
ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice4.pdf', width=20, height=20)
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% bins[101:126],], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat)) +
geom_boxplot(data=s.libs.all[s.libs.all$name %in% bins[101:126],], aes(x=lib2, y=log(mean+1))) +
facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())
ggsave('Data/02_Metagenomics/mag_visualization/bin_visualization_avg_fold_slice5.pdf', width=20, height=20)
Identify bins that only occur in single samples
head(s.libs.all)
summary(s.libs.all$mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1604 0.8401 1.8076 2.2550 64.3580
s.libs.all$presence<-ifelse(s.libs.all$mean>0.5, yes=T, no=F)
s.pres<-s.libs.all[s.libs.all$presence==T,]
num.libs.perbin<-s.pres %>% group_by(name) %>% count()
singletons<-num.libs.perbin[num.libs.perbin$n==1,]
singletons
Confirm with visualization
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% singletons$name,], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat)) +
geom_boxplot(data=s.libs.all[s.libs.all$name %in% singletons$name,], aes(x=lib2, y=log(mean+1))) +
facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())
remove<-c('72.contigs','metabat.229.contigs','metabat.295.contigs','metabat.372.contigs', 'metabat.52.contigs')
singletons<-singletons[!singletons$name %in% remove,]
ggplot() + geom_jitter(alpha=0.25,data=df.reduced.all[df.reduced.all$name %in% singletons$name,], aes(x=lib2, y=log(Avg_fold+1), size=Length, group=paste(name,habitat,library), color=habitat)) +
geom_boxplot(data=s.libs.all[s.libs.all$name %in% singletons$name,], aes(x=lib2, y=log(mean+1))) +
facet_wrap(name~., scales='free_x') + scale_color_manual(values = in_edge_out_colors) + theme_new + theme(axis.text.x = element_blank())
df.analysis<-df.reduced.all[!df.reduced.all$name %in% singletons$name,] #removes bins that only occur in a single library
df.mcov.analysis<-s.libs.all[!s.libs.all$name %in% singletons$name,]
For each bin, compare average coverage value of the contigs across replicates
set up data
habitat_lookup<-data.frame(letter=c('A','B','C','D','E','F','G','H','I'),habitat=rep(c('Out','Edge','In'),c(3,3,3)))
df.mcov.analysis$location<-habitat_lookup[match(df.mcov.analysis$lib2, habitat_lookup$letter),'habitat']
head(df.mcov.analysis)
vars<-unique(df.mcov.analysis$name); length(vars)
## [1] 119
preform initial model test and check for assumptions of normality
mod<-aov(log(mean) ~ location, data=df.mcov.analysis[df.mcov.analysis$name==vars[1],])
summary(mod)
## Df Sum Sq Mean Sq F value Pr(>F)
## location 2 27.627 13.814 56.52 0.000128 ***
## Residuals 6 1.466 0.244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,2))
plot(mod)
models<-df.mcov.analysis %>% group_by(name) %>% do(mod = summary(aov(log(mean+1) ~ location, data = .)))
results<-data.frame(bin=models$name,
p.value=as.vector(unlist(models$mod)[names(unlist(models$mod))=='Pr(>F)1']),
SumSq=as.vector(unlist(models$mod)[names(unlist(models$mod))=='Sum Sq1']),
Df1=as.vector(unlist(models$mod)[names(unlist(models$mod))=='Df1']),
F_value=as.vector(unlist(models$mod)[names(unlist(models$mod))=='F value1']),
mean_sq=as.vector(unlist(models$mod)[names(unlist(models$mod))=="Mean Sq1"]))
results$p.adjust<-p.adjust(results$p.value, method = 'BH')
head(results)
sig.taxa<-results[which(results$p.adjust < 0.1),];length(sig.taxa$bin)
## [1] 91
ns.taxa<-results[which(results$p.adjust > 0.1),]
write.csv(results,'Data/02_Metagenomics/mag_visualization/Aug19_bin_abundance_model_results.csv')
Check for pairwise differences
sig.mods<-df.mcov.analysis[df.mcov.analysis$name %in% sig.taxa$bin, ]; dim(sig.mods)
## [1] 819 8
models<-sig.mods %>% group_by(name) %>% do(mod = aov(log(mean+1) ~ location, data = .))
head(models)
For all models with adjusted pvalue < 0.1, went through and calculated posthoc tests to determine which groups were different. All other bins, visualually assessed which habitat they occured in. Notes of results are in file called: Results.xlsx
library(agricolae)
tuk.mod<-data.frame()
for (i in 1:length(models$name)){
tmp<-models$mod[[i]]
hsd<-HSD.test(y = tmp, trt = 'location')
tmp2<-data.frame( bin=models$name[i],hsd$groups)
tmp2$location<-rownames(tmp2)
tuk.mod<-rbind(tuk.mod,tmp2)
}
head(tuk.mod); tail(tuk.mod)
write.csv(tuk.mod,'Data/02_Metagenomics/mag_visualization/Aug19_tukey_bin_results.csv')